**************************************************************
***** Name:        do_merge_indiv.do	      		     *****
***** Description: This do file creates data at the		 *****
***** individual level.									 *****
**************************************************************

clear all
set more off

* Project root. If the global macro path is already defined when this
* do-file starts (for example by a master do-file or a profile.do), that
* value is used; otherwise fall back to the original author's location.
* All file references below are relative to this directory.
if `"$path"' == "" {
	global path "C:\Users\patry\OneDrive\Desktop\Mario replication files"
}
cd "$path"

* 1.0 Load the ingredient data (one row per individual-round)

use "Data/raw data/indiv_ingred.dta", clear

* 2.0 Attach the survey data (one row per individual-phase)
*     The match indicator is stored directly as _merge_survey.

merge 1:1 team_id player round using "Data/raw data/indiv_survey.dta", ///
	generate(_merge_survey)

/* Notes:

 The master data are individual-round; the survey data are individual-phase.
 Survey responses are stored on the last round of each phase, so round 6
 carries the phase 2 survey and round 12 carries the phase 3 survey.

 1) _merge_survey == 1: individual ingredients but no survey data

 There are 2,220 such rows:

 * 2,200 because survey data exist only in rounds 6 and 12.
   tab round if _merge_survey==1
 * 20 in round 12 because the AI player has no survey data.
   tab player if _merge_survey==1 & round==12

 2) _merge_survey == 2: survey data but no individual ingredients

 There are 26 such rows:

 * All 26 are players 5 and 6, for whom we have no individual performance
   scores (we did not record the screens in that room).
   tab round player if _merge_survey==2

 Note: for the other (extra) room the dataset contains round 6 only. We did
 not record that room, so there is no individual performance for it; these
 rows hold survey responses only.

*/

* 2.1 Cleaning: put the data in team-round-player order

sort team_id round player

* 2.2 Extra-room flag
*     Players 5 and 6 in rounds 1-6. Per the authors' notes there are no
*     individual ingredients or difficulty-level controls for these rows,
*     and they are excluded from the analysis.

gen extraroom = inrange(round,1,6) & inlist(player,5,6)
label variable extraroom "Participants in extra room (#5 & 6)"
order extraroom, after(newhire_spillover)

* 3.0 Attach the phase 1 skills data (one row per individual)
*     The match indicator is stored directly as _merge_phase1.

merge m:1 team_id player using "Data/raw data/indiv_phase1.dta", ///
	generate(_merge_phase1)

/* Notes:

 The master data are individual-round; the phase 1 data are individual-level,
 so each phase 1 record is copied onto every round of that individual.

 1) _merge_phase1 == 1: ingredients and/or survey data but no phase 1 scores

 There are 120 such rows:

 * All 120 are AI characters. Their phase 1 scores are imputed in section
   5.1.1 below. The AI characters have individual ingredients but no survey
   data.

*/

sort team_id round player

* 4. Tidy up

* 4.1 Variable labels
*     Variable names and their labels are kept in two parallel lists; the
*     i-th label is applied to the i-th variable.

local vars team_id round player ingred tacit explicit ai newhire control
local labs `" "Team ID" "Round" "Player #" "Ingredients" "Tacit condition" "Explicit condition" "AI condition" "New hire condition" "Control condition" "'

local nvars : word count `vars'
forvalues i = 1/`nvars' {
	local v   : word `i' of `vars'
	local lab : word `i' of `labs'
	label variable `v' "`lab'"
}

* 4.2 Separator ("blocker") variables
*     Each blocker is an all-missing variable whose label is a row of
*     asterisks. It is placed immediately before the first variable of a
*     group so the groups are visually separated in the variable list:
*
*       IDENTIFIERS  before team_id         (team identifiers)
*       PERFORMANCE  before ingred          (ingredients)
*       TREATMENT    before tacit           (treatment indicators)
*       SURVEY       before talkcoordinate  (survey items)
*       SKILLS       before game1           (phase 1 scores)

* ingred is moved next to extraroom so the PERFORMANCE blocker lands there.
order ingred, after(extraroom)

local blockers IDENTIFIERS PERFORMANCE TREATMENT SURVEY         SKILLS
local anchors  team_id     ingred      tacit     talkcoordinate game1

forvalues i = 1/5 {
	local blk    : word `i' of `blockers'
	local anchor : word `i' of `anchors'

	gen `blk' = .
	label var `blk' "************************************"
	order `blk', before(`anchor')
}

* 5. Cleaning

* 5.1 Phase 1 scores

* 5.1.1 Impute phase 1 scores for the AI player (player 0)
*       The four constants are the game 1-4 values assigned to the AI.

local ai_scores 33.99 28.58 17.37 23
foreach g in 1 2 3 4 {
	local val : word `g' of `ai_scores'
	replace game`g' = `val' if player==0
}

* 5.1.2 Standardize each phase 1 game score
*       Mean and SD are taken over round 1 rows only; the resulting
*       z-score is then computed for every row.

tempname mu sigma
foreach g in 1 2 3 4 {
	quietly summarize game`g' if round==1
	scalar `mu'    = r(mean)
	scalar `sigma' = r(sd)
	quietly generate z_game`g' = (game`g' - `mu')/`sigma'
	label variable z_game`g' "Game `g', z-score"
}

* 5.1.3 Composite skill score
*       Signed sum of the four z-scores (games 1-3 enter negatively, game 4
*       positively), re-standardized on round 1 rows. Because of the
*       re-standardization, using a sum rather than a mean does not matter.
*       NOTE(review): the signs presumably reflect that lower is better in
*       games 1-3 -- confirm against the game definitions.

gen z_game = -z_game1 + -z_game2 + -z_game3 + z_game4
quietly summarize z_game if round==1
scalar `mu'    = r(mean)
scalar `sigma' = r(sd)
quietly replace z_game = (z_game - `mu')/`sigma'
label variable z_game "Average skill from games, z-score"


* 5.2 Phase indicators
*     phase2 = 1 for rounds up to 6, phase3 = 1 for rounds 7 and later.
*     Stata treats a missing value as larger than any number, so a bare
*     (round>=7) would flag rows with a missing round as phase 3. The
*     explicit missing() guard keeps such rows at 0 in both indicators.

gen phase2 = (round<=6)
label var phase2 "Phase 2 (Rounds 1-6)"
gen phase3 = (round>=7 & !missing(round))
label var phase3 "Phase 3 (Rounds 7-12)"


* 5.3 Pair identifiers (for round-difficulty merge)

* 5.3.1 Pair identifiers

* Create pair identifiers; pair is one of the merge keys (team_id round pair)
* used for the difficulty merge in section 6.
* 1 = left, 2 = right, 3 = extra room (players 5 & 6 in phase 2)
*
* How the rules below work:
*  - Every non-extra-room row starts in pair 1; extra-room rows start missing.
*  - Rows matching one of the "replace pair = 2" conditions move to pair 2.
*  - Whatever is still missing at the end (the extra-room rows) becomes 3.

gen pair = 1 if extraroom==0
la var pair "Pair"

* Tacit teams: players 3 and 4 are pair 2 in every round.
* Explicit teams, rounds 1-6: the players assigned to pair 2 change by round
* (3&4 in rounds 1/4, 2&4 in rounds 2/5, 2&3 in rounds 3/6).
replace pair = 2 if tacit==1    & inlist(player,3,4)
replace pair = 2 if explicit==1 & inlist(player,3,4) & inlist(round,1,4)
replace pair = 2 if explicit==1 & inlist(player,2,4) & inlist(round,2,5)
replace pair = 2 if explicit==1 & inlist(player,2,3) & inlist(round,3,6)

* Explicit + control, rounds 7-12: same rotation as rounds 1-6, shifted by 6.
replace pair = 2 if explicit==1 & control==1 & inlist(player,3,4) & inlist(round,7,10)
replace pair = 2 if explicit==1 & control==1 & inlist(player,2,4) & inlist(round,8,11)
replace pair = 2 if explicit==1 & control==1 & inlist(player,2,3) & inlist(round,9,12)

* Explicit + AI, rounds 7-12: the rotation involves player 0 (the AI) together
* with players 3 and 4.
replace pair = 2 if explicit==1 & ai==1 & inlist(player,0,4) & inlist(round,7,10)
replace pair = 2 if explicit==1 & ai==1 & inlist(player,0,3) & inlist(round,8,11)
replace pair = 2 if explicit==1 & ai==1 & inlist(player,3,4) & inlist(round,9,12)

* Explicit + new hire, rounds 7-12: same player/round pattern as control.
* NOTE(review): player 6 never appears in these conditions, so player 6
* rows in rounds 7-12 stay in pair 1 -- confirm this is the intended seating.
replace pair = 2 if explicit==1 & newhire==1 & inlist(player,3,4) & inlist(round,7,10)
replace pair = 2 if explicit==1 & newhire==1 & inlist(player,2,4) & inlist(round,8,11)
replace pair = 2 if explicit==1 & newhire==1 & inlist(player,2,3) & inlist(round,9,12)

* Remaining missing values are the extra-room rows (pair was only initialized
* where extraroom==0).
replace pair = 3 if pair == . 

* NOTE: The dataset does not have pair 3 data in phase 2 except for round 6 (where they
* complete the survey). This is because we did not record their screens. However, we
* have their skills data + survey responses.

* 6. Merge in difficulty (one row per team-round-pair)
*    Using-only rows are not kept; see note 2 below.

merge m:1 team_id round pair using "Data/temp data/indiv_difficulty.dta", ///
	keep(master match) generate(_merge_diff)

/* Notes:

 The master data are individual-round; the difficulty data are at the
 team-round-pair level, so each difficulty record is copied onto every
 player in that pair and round.

 1) _merge_diff == 1: no difficulty data

 There are 6 such rows:

 * All 6 are from the extra room, where we did not collect difficulty data.
   tab player round if _merge_diff==1

 2) Using-only rows (difficulty data but no performance/survey/skill data)

 * These are not kept. They are the corresponding phase 2 observations from
   rounds 1 to 5 for the extra room. We do not use them in the analysis.

*/

* The merge indicators (_merge_survey, _merge_phase1, _merge_diff) are no
* longer needed past this point.
drop _merge*

* 7. Skills analysis

* 7.1 Skill index
*     Regress each individual's mean ingredients on the four standardized
*     phase 1 game scores and their pairwise products, and use the
*     standardized fitted value as the skill index.

preserve

	* Phase 2 rows, plus every AI (player 0) row regardless of phase.
	keep if phase2==1 | player==0

	* One row per individual.
	collapse (mean) ingred z_game1 z_game2 z_game3 z_game4 z_game, by(team_id player)

	* Full 4x4 grid of products: squares plus both orderings of each pair.
	* The mirrored orderings are identical, so regress omits them as collinear.
	forvalues a = 1/4 {
		forvalues b = 1/4 {
			generate X_game`a'`b' = z_game`a' * z_game`b'
		}
	}

	* Fitted values from the regression, standard errors clustered by team.
	regress ingred z_game1 z_game2 z_game3 z_game4 X_game*, vce(cluster team_id)
	predict skills, xb

	* Standardize the fitted values over all collapsed rows.
	tempname mu sigma
	quietly summarize skills
	scalar `mu'    = r(mean)
	scalar `sigma' = r(sd)
	quietly replace skills = (skills - `mu')/`sigma'
	label variable skills "Skill index"

	keep team_id player skills z_game
	sort skills

	tempfile skillfile
	save "`skillfile'"

restore

* Bring the index back onto the individual-round data. This leaves a
* variable named _merge in memory, which is dropped in section 8.2.
merge m:1 team_id player using "`skillfile'"


* 7.2 Player-specific phase 2 score and skill (players 1, 2, 3, 4 and 6)
*     For each team and each listed player n, take that player's mean
*     ingredients and mean z_game over phase 2 rows and write them to
*     phase2score_pn / z_game_pn on ALL phase 3 rows of the team (not only
*     player n's rows). If a team has no usable phase 2 rows for player n,
*     the values stay missing.

levelsof team_id, local(teams)

foreach n in 1 2 3 4 6 {

	gen phase2score_p`n' = .
	gen z_game_p`n' = .

	foreach t of local teams {
		quietly summarize ingred if team_id=="`t'" & player==`n' & phase2==1
		quietly replace phase2score_p`n' = r(mean) if team_id=="`t'" & phase3==1

		quietly summarize z_game if team_id=="`t'" & player==`n' & phase2==1
		quietly replace z_game_p`n' = r(mean) if team_id=="`t'" & phase3==1
	}
}



* 7.3 Save the skill measures at the team-round-pair level
*     Mean/min/max of skills and z_game within each team-round-pair, plus
*     the mean of every player-specific phase 2 score and skill variable.
*     The individual-level data in memory are left untouched.

preserve

	collapse ///
		(mean) skills_mean = skills ///
		(min)  skills_min  = skills ///
		(max)  skills_max  = skills ///
		(mean) z_game_mean = z_game ///
		(min)  z_game_min  = z_game ///
		(max)  z_game_max  = z_game ///
		(mean) phase2score_p* z_game_p*, ///
		by(team_id round pair)

	sort team_id pair round

	label variable skills_min  "Min skills on team"
	label variable skills_mean "Mean skills on team"
	label variable skills_max  "Max skills on team"

	save "Data/temp data/team_skills.dta", replace

restore


* 8. Cleaning

* 8.1 Sample indicator: 1 for every row that is not an extra-room row

gen sample = !(extraroom==1)
label variable sample "Experimental sample"

* 8.2 Phase 2 performance
*     For each team and each of players 1-6, the player's own mean
*     ingredients over phase 2 rows is written to that player's phase 3 rows.

levelsof team_id , local(teams)

gen phase2score = .

foreach t of local teams {
	forvalues p = 1/6 {
		quietly summarize ingred if team_id=="`t'" & player==`p' & phase2==1
		quietly replace phase2score = r(mean) if team_id=="`t'" & player==`p' & phase3==1
	}
}

* AI (player 0): constant, described by the authors as the average score
* from 50 rounds. It is applied to every AI row, without a phase condition.
* NOTE(review): section 9 assigns the AI a different constant (8.52) for
* phase2score_indiv -- confirm that both values are intended.

replace phase2score = 7.22 if player==0

* Phase 2 score for player 6 (the new hires). We do not have their
* individual phase 2 performance, so per the authors the merged value
* (phase2score_merge) is half the average team score.

* Remove the indicator left over from the skills merge in section 7.1.
drop _merge

merge m:1 team_id player using "Data/raw data/indiv_p6_phase2score.dta", nogenerate

* Only player 6 rows take the merged value; the helper variable is then
* discarded.
replace phase2score = phase2score_merge if player==6
drop phase2score_merge

* 8.3 Missing performance data
*     For 102411 (P3), 112203 (R7), & 112501 (P3) the video was lost, so
*     individual scores are imputed as half of the team score (totalingred,
*     which comes from the team-score file merged here).
*     The replace is conditioned on the team only: ANY row of these three
*     teams with a missing ingred and a non-missing totalingred is filled.
*     NOTE(review): confirm that no other rows of these teams (for example
*     extra-room rows) have a team score available.
*     This step runs after phase2score (8.2) and the phase2score_p*
*     variables (7.2) were computed, so those do not reflect the imputation.

merge m:1 team_id pair round using  "Data/temp data/teamscoremerge.dta", nogenerate

replace ingred = 0.5 * totalingred ///
	if inlist(team_id,"102411","112203","112501") & missing(ingred)


* 8.4 Team score levels
*     Already in memory: they come from the teamscoremerge.dta merge in 8.3,
*     so no separate merge is needed here.

* 9. Individual phase 2 score, final labels, and save

sort team_id round player

* phase2score_indiv: for players 1-4, the player's own mean ingredients over
* phase 2 rows, written to that player's phase 3 rows. It is computed after
* the imputation in 8.3, so it uses the imputed ingred values.

quietly generate phase2score_indiv = .

levelsof team_id, local(teams)
foreach t of local teams {
	forvalues p = 1/4 {
		quietly summarize ingred if phase2==1 & player==`p' & team_id=="`t'"
		quietly replace phase2score_indiv = r(mean) if phase3==1 & player==`p' & team_id=="`t'"
	}
}

* AI (player 0), phase 3 rows only: constant described by the authors as the
* average from simulations.
replace phase2score_indiv = 8.52     if player==0 & phase3==1


* Labels
* NOTE(review): phase2score and phase2score_indiv carry the same label text.
label variable phase2score       "Average phase 2 score"
label variable phase2score_indiv "Average phase 2 score"

foreach i in 1 2 3 4 6 {
	label variable z_game_p`i'      "Average skill, Player `i'"
	label variable phase2score_p`i' "Average phase 2 score, Player `i'"
}

save "Data/clean data/clean_indiv.dta", replace


* 10. Save each individual's average phase 2 ingredients
*     Players 1-4, rounds up to 6, one row per team-player. The same data
*     are written twice; the two files differ only in the name of the
*     player key (player_1 vs player_2).
*     NOTE(review): the score variable is named ingred_p1_phase2 in BOTH
*     files, including the player_2 one -- confirm that downstream merges
*     expect this name.

keep team_id player round ingred
keep if round<=6 & inrange(player,1,4)

collapse (mean) ingred, by(team_id player)

* File keyed on player_1
rename ingred ingred_p1_phase2
label variable ingred_p1_phase2 "Average ingredients, phase 2"
rename player player_1

save "Data/temp data/indivingred_control_player1.dta", replace

* File keyed on player_2 (same contents, key renamed)
rename player_1 player_2

save "Data/temp data/indivingred_control_player2.dta", replace



exit


